### supplemental material to:
### Michael Kloster, Daniel Langenkämper, Martin Zurowietz, Bánk Beszteri, Tim W. Nattkemper (2020) 
### Deep learning-based diatom taxonomy on virtual slides


### please execute script 01 prior to this one!!!


### helper functions for preparing data etc.

## required libraries
require(keras)
library(caret)
library(e1071)

## functions

# split data into training-, validation- and test-set, with a fixed proportion for each class
splitDLData <- function (
  data, # annotation data to split; the ClassId column defines the classes
  portionTrain, # portion of each class to be used as training data [0:1]
  portionValidate = NULL, # portion of each class to be used as validation data [0:1]. If NULL, everything not used for training becomes validation data and no test data is produced
  seed = NULL # optional seed for initialization of random sampling
  )
{
  # Splits the data class by class, so that every class contributes the same proportion to each set.
  # Fractional set sizes are rounded down. Returns a list with the data frames "train", "validate"
  # and "test", each of them shuffled.

  # if requested, initialize random number generator
  if(!is.null(seed))
  {
    set.seed(seed)
  }

  # draw 'size' elements from 'pool' without replacement.
  # sample(pool, size) must not be used here: if 'pool' has shrunk to a single index n, sample()
  # interprets it as the range 1:n and may return rows that were already put into another set.
  # Indexing the pool by position yields the same draws for longer pools and is correct for short ones.
  drawFrom <- function(pool, size)
  {
    pool[sample.int(length(pool), size)]
  }

  # zero-row template with the columns of the input, independent of the column contents
  data.empty <- data[0, , drop = FALSE]

  # iterate over classes to prepare data for each class; the per-class parts are collected in lists
  # and combined once at the end instead of growing the data frames inside the loop
  classIds <- unique(data$ClassId)
  parts.train <- vector("list", length(classIds))
  parts.validate <- vector("list", length(classIds))
  parts.test <- vector("list", length(classIds))
  slot <- 0L
  for(classId in classIds)
  {
    slot <- slot + 1L
    class.Data <- data[data$ClassId == classId, , drop = FALSE] # get all data for the current class
    class.Size <- nrow(class.Data)
    class.IDs <- seq_len(class.Size) # consecutive indices of the class data, used to split it into the three sets

    # get indices from the current class for training set samples
    indices.Train <- drawFrom(class.IDs, floor(class.Size*portionTrain))

    # remove training samples from the pool, since they are already used
    class.IDs <- setdiff(class.IDs, indices.Train)

    if(is.null(portionValidate)) # portion of validation not supplied, so make it the rest (= no test data)
    {
      indices.Validate <- class.IDs # rest is for validation
      indices.Test <- integer(0) # no test data
    } else {
      # get indices of validation set samples
      indices.Validate <- drawFrom(class.IDs, floor(class.Size*portionValidate))

      # the remaining samples are the test set
      indices.Test <- setdiff(class.IDs, indices.Validate)
    }

    # remember the samples of the current class for the overall training-, validation- and test-sets
    parts.train[[slot]] <- class.Data[indices.Train, , drop = FALSE]
    parts.validate[[slot]] <- class.Data[indices.Validate, , drop = FALSE]
    parts.test[[slot]] <- class.Data[indices.Test, , drop = FALSE]
  }

  # combine the per-class parts; the empty template guarantees a data frame even without any class
  data.train <- do.call(rbind, c(list(data.empty), parts.train))
  data.validate <- do.call(rbind, c(list(data.empty), parts.validate))
  data.test <- do.call(rbind, c(list(data.empty), parts.test))

  # shuffle data, so that the classes are no longer grouped
  data.train <- data.train[sample(nrow(data.train)), , drop = FALSE]
  data.validate <- data.validate[sample(nrow(data.validate)), , drop = FALSE]
  data.test <- data.test[sample(nrow(data.test)), , drop = FALSE]

  # return training-, validation- and test-set
  list("train" = data.train, "validate" = data.validate, "test" = data.test)
}


# assign every annotation to one of k folds, with the specimens of each class distributed over the folds, for k-fold cross-validation experiments
splitDLDataKFold <- function (
  data, # annotation data to distribute over the folds; the ClassId column defines the classes, ClassName is used for the warning message
  folds, # number of folds for splitting (a single number >= 1)
  seed = NULL # optional seed for initialization of random sampling
  )
{
  # Returns the rows of 'data', grouped by class, extended by a column 'k' holding the index of the
  # fold each row was assigned to. Returns NULL if 'data' contains no class at all.

  # reject fold counts that cannot be used for splitting
  if(!is.numeric(folds) || length(folds) != 1 || is.na(folds) || folds < 1)
  {
    stop("'folds' must be a single number >= 1")
  }

  # if requested, initialize random number generator
  if(!is.null(seed))
  {
    set.seed(seed)
  }

  # iterate over classes to assign the specimens of each class to the folds; the per-class parts are
  # collected in a list and combined once at the end instead of growing a data frame inside the loop
  classIds <- unique(data$ClassId)
  parts <- vector("list", length(classIds))
  slot <- 0L
  for(classId in classIds)
  {
    slot <- slot + 1L
    class.Data <- data[data$ClassId == classId,] # get all data for the current class
    class.Size <- nrow(class.Data)

    # check if enough specimens are available to put at least one of them into each fold
    if(class.Size < folds)
    {
      warning(paste("Number of folds exceeds number of specimens for class: ", classId, " '", class.Data[1,]$ClassName,"'"), immediate. = TRUE)
    }

    # divide specimens of this class into folds: a random permutation of the row positions is cut
    # into 'folds' intervals of equal width, the interval number is the fold index
    fold.indices <- sample.int(class.Size)
    if(folds < 2)
    {
      # cut() refuses to build fewer than two intervals, so the single fold is assigned directly
      class.Data$k <- rep(1L, class.Size)
    } else {
      class.Data$k <- cut(fold.indices, breaks=folds, labels=FALSE)
    }

    parts[[slot]] <- class.Data
  }

  # return data extended by k, as index of the fold
  do.call(rbind, parts)
}


# generate statistics (number of specimens per class, in total and per expedition) about a data set (training, validation, test)
dataDLStatistics <- function(
  dataDL, # the data set to analyze; the columns ClassId, ClassName and Cruise are read
  sumUp = FALSE # flag indicating if class counts should be summed up to the total number for all classes
  )
{
  # Returns a data frame with one row per class, sorted by class id, with the columns ClassId,
  # ClassName, Count (all specimens), Count.ANTXXVIII2 and Count.PS103 (specimens per expedition).
  # If sumUp is TRUE, a final row named "Sum" with the column totals is appended.

  # create data frame with one row for each class
  statistics <- data.frame("ClassId" = unique(dataDL$ClassId))
  classCount <- nrow(statistics)

  # membership masks per expedition; %in% never yields NA, so rows without cruise information
  # are simply not counted for any expedition instead of being counted for all of them
  onANTXXVIII2 <- dataDL$Cruise %in% "ANT-XXVIII/2"
  onPS103 <- dataDL$Cruise %in% "PS103"

  # collect names and counts per class; seq_len() keeps the loop from running on empty input
  classNames <- character(classCount)
  counts <- integer(classCount)
  counts.ANTXXVIII2 <- integer(classCount)
  counts.PS103 <- integer(classCount)
  for(i in seq_len(classCount))
  {
    inClass <- dataDL$ClassId %in% statistics$ClassId[i] # rows belonging to the current class
    classNames[i] <- as.character(dataDL[inClass, "ClassName"][1]) # name of the class, taken from its first row
    counts[i] <- sum(inClass) # total count of specimens of this class
    counts.ANTXXVIII2[i] <- sum(inClass & onANTXXVIII2) # count of specimens of this class for expedition ANT-XXVIII/2
    counts.PS103[i] <- sum(inClass & onPS103) # count of specimens of this class for expedition PS103
  }

  statistics$ClassName <- classNames
  statistics$Count <- counts
  statistics$Count.ANTXXVIII2 <- counts.ANTXXVIII2
  statistics$Count.PS103 <- counts.PS103

  statistics <- statistics[order(statistics$ClassId),] # sort statistics by class id

  # sum up counts of all classes to total count if requested
  if(sumUp)
  {
    totals <- data.frame("ClassId" = NA, "ClassName" = "Sum",
                         "Count" = sum(statistics$Count),
                         "Count.ANTXXVIII2" = sum(statistics$Count.ANTXXVIII2),
                         "Count.PS103" = sum(statistics$Count.PS103))

    statistics <- rbind(statistics, totals)
  }

  # return data frame with counts of specimens per class and expedition
  statistics
}

